In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
sns.__version__


Out[2]:
'0.8.1'

In [3]:
np.random.seed(42) # setting a random seed

Loading some example data


In [4]:
data0 = np.random.randn(200)

mean, cov = [0, 1], [(1, .5), (.5, 1)]
data1 = np.random.multivariate_normal(mean, cov, 200)
df = pd.DataFrame(data1, columns=["x", "y"])

iris = sns.load_dataset("iris")
titanic = sns.load_dataset("titanic")
tips = sns.load_dataset("tips")
tips["big_tip"] = (tips.tip / tips.total_bill) > .15
anscombe = sns.load_dataset("anscombe")

1. Visualizing the distribution of data

1.1 Univariate

distplot - draws a histogram and fits a Kernel Density Estimate (KDE)


In [5]:
sns.distplot(data0, bins=5, 
             hist=True, # whether to show the histogram
             kde=True, # whether to fit a KDE curve <=> equivalent of kdeplot
             rug=True)  # draw a small vertical tick at each observation <=> equivalent of rugplot


Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa6e55a9588>

In [6]:
sns.rugplot(data0)
sns.kdeplot(data0, shade=True)


Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa6d62b8940>
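
The smoothness of the KDE is controlled by its bandwidth. A quick sketch (not an executed cell from this notebook; the bw values are only illustrative):

sns.kdeplot(data0, bw=.2, label="bw: 0.2")  # smaller bandwidth follows the data more closely
sns.kdeplot(data0, bw=2, label="bw: 2")     # larger bandwidth over-smooths
plt.legend();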

In [7]:
# we can also compare our data against a theoretical distribution by fitting its parameters, as shown
sns.distplot(data0, 
             kde=False, # better to keep this False to see the fitted distribution clearly
             fit=sp.stats.norm)


Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa6d5fd4eb8>

1.2 Bivariate

Scatter plot - using jointplot
Hexbin plot - shows the count of observations that fall within hexagonal bins
KDE plot - contour plot

Scatter plot


In [8]:
sns.jointplot(x="x", y="y", data=df)


Out[8]:
<seaborn.axisgrid.JointGrid at 0x7fa6d5ff7cc0>

Hexbin plot


In [9]:
sns.jointplot(x="x", y="y", data=df, kind="hex")


Out[9]:
<seaborn.axisgrid.JointGrid at 0x7fa6d6577ef0>

KDE plot


In [10]:
sns.jointplot(x="x", y="y", data=df, kind='kde')


Out[10]:
<seaborn.axisgrid.JointGrid at 0x7fa6d5d1db00>

In [11]:
# a denser contour KDE plot
cmap = sns.cubehelix_palette(as_cmap=True, dark=0, light=1, reverse=True)
sns.kdeplot(df.x, df.y, cmap=cmap, n_levels=60, shade=True);



In [12]:
# jointplot returns a JointGrid (saved in variable jg), through which we can layer further customizations as shown
jg = sns.jointplot(x="x", y="y", data=df, kind="kde", color="m")

jg.plot_joint(plt.scatter, c="w", s=30, marker="+")  # overlay the raw observations
jg.ax_joint.collections[0].set_alpha(0)  # hide the outermost (background) contour fill
jg.set_axis_labels("$X-axis$", "$Y-axis$");


1.3 Pairwise relationship

pairplot


In [13]:
sns.pairplot(iris)


Out[13]:
<seaborn.axisgrid.PairGrid at 0x7fa6d5959f98>
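
pairplot can also color the observations by a categorical column via hue. A minimal sketch (not an executed cell here), using the species column of iris:

sns.pairplot(iris, hue="species")  # one color per species in every panel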

2. Categorical data

2.1 Categorical scatter plot

stripplot - scatter plot of a quantitative variable against a categorical one
swarmplot - the same, but adjusts point positions to avoid overlapping observations

Stripplot


In [14]:
sns.stripplot(x="day", y="total_bill", data=tips)


Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa6d519b358>

In [15]:
# reduce overlap by adding a little random jitter to each point
sns.stripplot(x="day", y="total_bill", data=tips, jitter=True)


Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa6d5c6b0b8>

Swarmplot


In [16]:
sns.swarmplot(x="day", y="total_bill", data=tips) # removes duplicates without random data; uses some algo


Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa6d5a59da0>

In [17]:
sns.swarmplot(x="day", y="total_bill", hue="time", data=tips) # multiple data on same plot


Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa6d5b5d9b0>
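
When hue is used, the levels are drawn overlaid within each category; to place them side by side instead, swarmplot accepts dodge=True. A quick sketch (not an executed cell here; assumes the dodge parameter of seaborn 0.8+, which older releases called split):

sns.swarmplot(x="day", y="total_bill", hue="time", data=tips, dodge=True)  # separate swarm per hue level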

2.2 Distributions of observations within categories

Boxplot
Violinplot - combines a boxplot with a KDE plot

boxplot


In [18]:
sns.boxplot(x="day", y="total_bill", hue="time", data=tips) # multiple data on same plot


Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa6d5bc7518>

violin plot


In [19]:
sns.violinplot(x="day", y="total_bill", hue="time", data=tips, 
               split=True,  # draw the two hue levels as halves of the same violin
               inner="stick") # draw a line for each original observation


Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa6d467d438>
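
inner accepts other values as well, e.g. "box" or "quartile". A small sketch (not an executed cell here) drawing quartile lines inside each half violin:

sns.violinplot(x="day", y="total_bill", hue="time", data=tips,
               split=True, inner="quartile")  # dashed lines mark the quartiles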

combining a violin plot with a swarm plot can give a more informative picture


In [20]:
sns.violinplot(x="day", y="total_bill", data=tips, inner=None)
sns.swarmplot(x="day", y="total_bill", data=tips, color="k", alpha=.5);


2.3 Statistical estimation within categories

Barplot
Countplot - shows the frequency of each category
Pointplot - shows point estimates connected across categories, along with confidence interval estimates

barplot


In [21]:
sns.barplot(x="day", y="total_bill", hue="time", data=tips)


Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa6d5b95438>
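
By default barplot shows the mean of each group with a bootstrapped confidence interval; any reducing function can be passed as the estimator. A minimal sketch (not an executed cell here) using the median instead:

sns.barplot(x="day", y="total_bill", data=tips, estimator=np.median)  # bar height = median total_bill per day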

countplot


In [22]:
sns.countplot(x="deck", data=titanic, palette="Blues")


Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa6d466ffd0>

point plot


In [23]:
sns.pointplot(x="day", y="total_bill", hue="time", data=tips)


Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa6d42a2da0>
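
The markers and line styles can be set per hue level to keep the plot readable even without color. A quick sketch (not an executed cell here):

sns.pointplot(x="day", y="total_bill", hue="time", data=tips,
              markers=["o", "x"], linestyles=["-", "--"])  # one style per time level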

3. Visualizing linear relationship

lmplot - for linear, polynomial and logistic regression
residplot - draws the residuals of a linear regression

lmplot


In [24]:
# consider fitting a linear regression of a quantitative variable against a discrete one, as shown
sns.lmplot(x="size", y="tip", data=tips, 
           x_jitter=.05)  # a little jitter makes the distribution of the discrete
                          # values easier to see without changing the regression line


Out[24]:
<seaborn.axisgrid.FacetGrid at 0x7fa6d4396e10>

In [25]:
# or we can collapse each discrete value with an estimator (e.g. the mean)
sns.lmplot(x="size", y="tip", data=tips, x_estimator=np.mean)


Out[25]:
<seaborn.axisgrid.FacetGrid at 0x7fa6d41e3eb8>

In [26]:
# Fitting a linear regression line
sns.lmplot(x="x", y="y", data=anscombe.query("dataset == 'I'"),
           fit_reg=True,  # whether to draw the regression line
           ci=None,  # disable the confidence interval band
           scatter_kws={"s": 80})  # size of points


Out[26]:
<seaborn.axisgrid.FacetGrid at 0x7fa6d4150a90>

In [27]:
# Fitting a polynomial regression curve
sns.lmplot(x="x", y="y", data=anscombe.query("dataset == 'II'"),
           order=2, # define your order here           
           scatter_kws={"s": 80});



In [28]:
# Overcoming outliers by building a robust fit line 
# (It uses a different loss function to downweight relatively large residuals)
sns.lmplot(x="x", y="y", data=anscombe.query("dataset == 'III'"),
           robust=True, ci=None, scatter_kws={"s": 80})


Out[28]:
<seaborn.axisgrid.FacetGrid at 0x7fa6d4042a20>

In [29]:
# Fitting a logistic regression curve
sns.lmplot(x="total_bill", y="big_tip", data=tips,
           logistic=True, y_jitter=.03)


Out[29]:
<seaborn.axisgrid.FacetGrid at 0x7fa6cef36dd8>

In [30]:
# Multiple variables on the same figure
sns.lmplot(x="total_bill", y="tip", hue="smoker", markers=["*","+"], palette="Set2", data=tips)


Out[30]:
<seaborn.axisgrid.FacetGrid at 0x7fa6cee91400>

In [31]:
# Facet into separate columns per category
sns.lmplot(x="total_bill", y="tip", hue="smoker", col="time", markers=["*","+"], palette="Set2", data=tips)


Out[31]:
<seaborn.axisgrid.FacetGrid at 0x7fa6cee74cc0>

residplot


In [32]:
# residuals scattered randomly around y=0 (with no structure) indicate that a linear regression is a good fit.
sns.residplot(x="x", y="y", data=anscombe.query("dataset == 'I'"), scatter_kws={"s": 80})


Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa6cee08f98>
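
For contrast, the same call on Anscombe's dataset 'II' (which is quadratic) would show a clear curved structure in the residuals, indicating that a plain linear fit is not appropriate. A minimal sketch (not an executed cell here):

sns.residplot(x="x", y="y", data=anscombe.query("dataset == 'II'"),
              scatter_kws={"s": 80})  # residuals form an arch rather than random scatter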